Note and Instrument Classification¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import librosa as librosa
import librosa.display
import os
In [2]:
# sets initial plot parameters for all of our plotting in this notebook
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True, figsize=(11, 5))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
In [3]:
df=pd.read_csv('note_info.csv') # imports audio sample information dataframe
In [4]:
df.head()
Out[4]:
Unnamed: 0 note_str sample_rate qualities_str instrument_source instrument_family_str instrument_family note instrument_source_str qualities pitch instrument_str instrument velocity
0 keyboard_acoustic_004-060-025 keyboard_acoustic_004-060-025 16000 ['dark', 'reverb'] 0 keyboard 4 278915 acoustic [0, 1, 0, 0, 0, 0, 0, 0, 1, 0] 60 keyboard_acoustic_004 327 25
1 bass_synthetic_033-050-100 bass_synthetic_033-050-100 16000 ['dark'] 2 bass 0 270361 synthetic [0, 1, 0, 0, 0, 0, 0, 0, 0, 0] 50 bass_synthetic_033 417 100
2 bass_synthetic_009-052-050 bass_synthetic_009-052-050 16000 ['bright', 'distortion', 'long_release'] 2 bass 0 270001 synthetic [1, 0, 1, 0, 1, 0, 0, 0, 0, 0] 52 bass_synthetic_009 150 50
3 keyboard_electronic_003-064-127 keyboard_electronic_003-064-127 16000 [] 1 keyboard 4 50978 electronic [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 64 keyboard_electronic_003 65 127
4 bass_synthetic_034-030-050 bass_synthetic_034-030-050 16000 ['distortion', 'tempo-synced'] 2 bass 0 265159 synthetic [0, 0, 1, 0, 0, 0, 0, 0, 0, 1] 30 bass_synthetic_034 420 50
In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12678 entries, 0 to 12677
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Unnamed: 0             12678 non-null  object
 1   note_str               12678 non-null  object
 2   sample_rate            12678 non-null  int64 
 3   qualities_str          12678 non-null  object
 4   instrument_source      12678 non-null  int64 
 5   instrument_family_str  12678 non-null  object
 6   instrument_family      12678 non-null  int64 
 7   note                   12678 non-null  int64 
 8   instrument_source_str  12678 non-null  object
 9   qualities              12678 non-null  object
 10  pitch                  12678 non-null  int64 
 11  instrument_str         12678 non-null  object
 12  instrument             12678 non-null  int64 
 13  velocity               12678 non-null  int64 
dtypes: int64(7), object(7)
memory usage: 1.4+ MB
In [6]:
len(df['pitch'].unique()) # identifies number of unique pitches 
Out[6]:
112
In [7]:
# defines function to extract various features from audio files. extraction functions provided by gist.github.com/gvyshnya

def extract_feature_means(audio_file_path: str) -> pd.DataFrame:
    # config settings
    number_of_mfcc = 20
    y, sr = librosa.load(audio_file_path)
    signal, _ = librosa.effects.trim(y)  
    n_fft = 512 # FFT window size
    hop_length = 256 # number audio of frames between STFT columns (looks like a good default)


    # Short-time Fourier transform (STFT)
    d_audio = np.abs(librosa.stft(signal, n_fft=n_fft, hop_length=hop_length))

    # 3. Spectrogram
    # Convert an amplitude spectrogram to Decibels-scaled spectrogram.
    db_audio = librosa.amplitude_to_db(d_audio, ref=np.max)

    # 4. Create the Mel Spectrograms
    s_audio = librosa.feature.melspectrogram(signal, sr=sr)
    s_db_audio = librosa.amplitude_to_db(s_audio, ref=np.max)

    # 5 Zero crossings

    # #6. Harmonics and Perceptrual
    # Note:
    #
    # Harmonics are characteristichs that represent the sound color
    # Perceptrual shock wave represents the sound rhythm and emotion
    y_harm, y_perc = librosa.effects.hpss(signal)

    # 7. Spectral Centroid
    # Note: Indicates where the ”centre of mass” for a sound is located and is calculated
    # as the weighted mean of the frequencies present in the sound.

    # Calculate the Spectral Centroids
    spectral_centroids = librosa.feature.spectral_centroid(signal, sr=sr)[0]
    spectral_centroids_delta = librosa.feature.delta(spectral_centroids)
    spectral_centroids_accelerate = librosa.feature.delta(spectral_centroids, order=2)

    # spectral_centroid_feats = np.stack((spectral_centroids, delta, accelerate))  # (3, 64, xx)

    # 8. Chroma Frequencies¶
    # Note: Chroma features are an interesting and powerful representation
    # for music audio in which the entire spectrum is projected onto 12 bins
    # representing the 12 distinct semitones ( or chromas) of the musical octave.

    # Increase or decrease hop_length to change how granular you want your data to be
    hop_length = 256

    # Chromogram
    chromagram = librosa.feature.chroma_stft(signal, sr=sr, hop_length=hop_length)

    # 9. Tempo BPM (beats per minute)¶
    # Note: Dynamic programming beat tracker.

    # Create Tempo BPM variable
    tempo_y, _ = librosa.beat.beat_track(signal, sr=sr)

    # 10. Spectral Rolloff
    # Note: Is a measure of the shape of the signal. It represents the frequency below which a specified
    #  percentage of the total spectral energy(e.g. 85 %) lies.

    # Spectral RollOff Vector
    spectral_rolloff = librosa.feature.spectral_rolloff(signal, sr=sr)[0]

    # spectral flux
    onset_env = librosa.onset.onset_strength(y=signal, sr=sr)

    # Spectral Bandwidth¶
    # The spectral bandwidth is defined as the width of the band of light at one-half the peak
    # maximum (or full width at half maximum [FWHM]) and is represented by the two vertical
    # red lines and λSB on the wavelength axis.
    spectral_bandwidth_2 = librosa.feature.spectral_bandwidth(signal, sr=sr)[0]
    spectral_bandwidth_3 = librosa.feature.spectral_bandwidth(signal, sr=sr, p=3)[0]
    spectral_bandwidth_4 = librosa.feature.spectral_bandwidth(signal, sr=sr, p=4)[0]

    audio_features = {
        "file_name": audio_file_path,
        "zero_crossing_rate": np.mean(librosa.feature.zero_crossing_rate(signal)[0]),
        "zero_crossings": np.sum(librosa.zero_crossings(signal, pad=False)),
        "spectrogram": np.mean(db_audio[0]),
        "mel_spectrogram": np.mean(s_db_audio[0]),
        "harmonics": np.mean(y_harm),
        "perceptual_shock_wave": np.mean(y_perc),
        "spectral_centroids": np.mean(spectral_centroids),
        "spectral_centroids_delta": np.mean(spectral_centroids_delta),
        "spectral_centroids_accelerate": np.mean(spectral_centroids_accelerate),
        "chroma1": np.mean(chromagram[0]),
        "chroma2": np.mean(chromagram[1]),
        "chroma3": np.mean(chromagram[2]),
        "chroma4": np.mean(chromagram[3]),
        "chroma5": np.mean(chromagram[4]),
        "chroma6": np.mean(chromagram[5]),
        "chroma7": np.mean(chromagram[6]),
        "chroma8": np.mean(chromagram[7]),
        "chroma9": np.mean(chromagram[8]),
        "chroma10": np.mean(chromagram[9]),
        "chroma11": np.mean(chromagram[10]),
        "chroma12": np.mean(chromagram[11]),
        "tempo_bpm": tempo_y,
        "spectral_rolloff": np.mean(spectral_rolloff),
        "spectral_flux": np.mean(onset_env),
        "spectral_bandwidth_2": np.mean(spectral_bandwidth_2),
        "spectral_bandwidth_3": np.mean(spectral_bandwidth_3),
        "spectral_bandwidth_4": np.mean(spectral_bandwidth_4),
    }

    # extract mfcc feature
    mfcc_df = extract_mfcc_feature_means(audio_file_path,
                                    signal,
                                    sample_rate=sr,
                                    number_of_mfcc=number_of_mfcc)

    df = pd.DataFrame.from_records(data=[audio_features])

    df = pd.merge(df, mfcc_df, on='file_name')

    return df

    # librosa.feature.mfcc(signal)[0, 0]

def extract_mfcc_feature_means(audio_file_name: str,
                          signal: np.ndarray,
                          sample_rate: int,
                          number_of_mfcc: int) -> pd.DataFrame:

    mfcc_alt = librosa.feature.mfcc(y=signal, sr=sample_rate,
                                    n_mfcc=number_of_mfcc)
    delta = librosa.feature.delta(mfcc_alt)
    accelerate = librosa.feature.delta(mfcc_alt, order=2)

    mfcc_features = {
        "file_name": audio_file_name,
    }

    for i in range(0, number_of_mfcc):
        # dict.update({'key3': 'geeks'})

        # mfcc coefficient
        key_name = "".join(['mfcc', str(i)])
        mfcc_value = np.mean(mfcc_alt[i])
        mfcc_features.update({key_name: mfcc_value})

        # mfcc delta coefficient
        key_name = "".join(['mfcc_delta_', str(i)])
        mfcc_value = np.mean(delta[i])
        mfcc_features.update({key_name: mfcc_value})

        # mfcc accelerate coefficient
        key_name = "".join(['mfcc_accelerate_', str(i)])
        mfcc_value = np.mean(accelerate[i])
        mfcc_features.update({key_name: mfcc_value})

    df = pd.DataFrame.from_records(data=[mfcc_features])
    return df
In [8]:
path = r"C:\Users\ksivi\Desktop\New folder\nsynth-valid\audio" # defines path for audio files
dir_list = os.listdir(path) # creates list of files found in path
In [9]:
len(dir_list)
Out[9]:
12676
In [10]:
# extracts features from audio files, appends to 'info' list

info = []

for i in dir_list:
    try:
        data = extract_feature_means("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+i)
    except:
        continue
    values = data.values
    info.append(values[0])
In [11]:
df_2 = extract_feature_means('bass_electronic_018-022-025.wav') # extracts features from single audio file
In [12]:
df_2.head()
Out[12]:
file_name zero_crossing_rate zero_crossings spectrogram mel_spectrogram harmonics perceptual_shock_wave spectral_centroids spectral_centroids_delta spectral_centroids_accelerate ... mfcc_accelerate_16 mfcc17 mfcc_delta_17 mfcc_accelerate_17 mfcc18 mfcc_delta_18 mfcc_accelerate_18 mfcc19 mfcc_delta_19 mfcc_accelerate_19
0 bass_electronic_018-022-025.wav 0.210125 15742 -66.413757 -77.642357 0.000006 -0.000673 2671.302747 22.499443 1.72652 ... 0.046337 -0.739872 -0.046959 0.051725 -0.290447 -0.08219 0.018868 2.694299 -0.056343 -0.000569

1 rows × 88 columns

In [13]:
columns = df_2.columns # pulls columns list from df_2
In [14]:
va = pd.DataFrame(data=info, columns = columns) # creates dataframe from extracted audio file features
In [15]:
# converts file name from full path to file, removes type(.wav)
for i in range(len(va)):
    va['file_name'][i] = va['file_name'][i][53:-4]
In [16]:
df['note_str'] = df['note_str'].astype(str) # converts note type to string
In [17]:
samples = df.merge(va,left_on='note_str', right_on='file_name',how='right') # combines extracted audio features with audio sample information dataframe
In [18]:
samples['instrument_family_str'].value_counts() # views count of unique instruments
Out[18]:
bass        2635
keyboard    2402
guitar      2070
organ       1598
brass        886
string       814
reed         720
mallet       663
flute        470
vocal        404
Name: instrument_family_str, dtype: int64
In [19]:
len(samples)
Out[19]:
12662
In [20]:
# plots wave shape of audio file
plt.figure(figsize=(20,4))
x, sr = librosa.load("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+dir_list[8512])
librosa.display.waveshow(y=x,sr=sr)
plt.title("Mallet Audio Wave")
plt.xlim(-0.1,2);
plt.savefig('mallet_wave.png')
In [21]:
# plots spectrogram (frequency vs time) of audio file

y, sr = librosa.load("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+dir_list[455])
fig, ax = plt.subplots(nrows=2, ncols=1, sharex=True)
D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
img = librosa.display.specshow(D, y_axis='linear', x_axis='time',sr=sr, ax=ax[0])
ax[0].set(title=dir_list[455])
ax[0].label_outer()
hop_length = 1024
D = librosa.amplitude_to_db(np.abs(librosa.stft(y, hop_length=hop_length)), ref=np.max)
librosa.display.specshow(D, y_axis='log', sr=sr, hop_length=hop_length,x_axis='time', ax=ax[1])
ax[1].set(title='Log-frequency power spectrogram')
ax[1].label_outer()
fig.colorbar(img, ax=ax, format="%+2.f dB", anchor =(3,1))
ax[0].set_xlim(0,2)
ax[1].set_xlim(0,2);
plt.savefig('bass_spec.png')
In [22]:
# plots spectrogram (frequency vs time) of audio files

y1, sr1 = librosa.load("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+dir_list[4002])
y2, sr2 = librosa.load("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+dir_list[455])
fig, axs = plt.subplots(nrows=2, ncols=2, sharex=False, figsize=(20,8))
fig.tight_layout(pad=5)
D1 = librosa.amplitude_to_db(np.abs(librosa.stft(y1)), ref=np.max)
img1 = librosa.display.specshow(D1, y_axis='linear', x_axis='time',sr=sr1, ax=axs[0,0])
D2 = librosa.amplitude_to_db(np.abs(librosa.stft(y2)), ref=np.max)
img2 = librosa.display.specshow(D2, y_axis='linear', x_axis='time',sr=sr2, ax=axs[0,1])

axs[0,0].set_title(dir_list[4002], fontsize=20)
axs[0,1].set_title(dir_list[455], fontsize=20)
hop_length = 1024
D1 = librosa.amplitude_to_db(np.abs(librosa.stft(y1, hop_length=hop_length)), ref=np.max)
librosa.display.specshow(D1, y_axis='log', sr=sr1, hop_length=hop_length,x_axis='time', ax=axs[1,0])
D2 = librosa.amplitude_to_db(np.abs(librosa.stft(y2, hop_length=hop_length)), ref=np.max)
librosa.display.specshow(D2, y_axis='log', sr=sr2, hop_length=hop_length,x_axis='time', ax=axs[1,1])

axs[1,0].set(title='Log-frequency power spectrogram')
axs[1,1].set(title='Log-frequency power spectrogram')
fig.colorbar(img, ax=axs, format="%+2.f dB",anchor =(2,1))
plt.setp(axs,xlim=(0,2));
plt.savefig('guitar_bass_spec.png')
In [23]:
# plots spectrogram (frequency vs time) of audio files

y3, sr3 = librosa.load("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+dir_list[8512])
y4, sr4 = librosa.load("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+dir_list[7481])
fig, axs = plt.subplots(nrows=2, ncols=2, sharex=False, figsize=(20,8))
fig.tight_layout(pad=5)
D1 = librosa.amplitude_to_db(np.abs(librosa.stft(y3)), ref=np.max)
img1 = librosa.display.specshow(D1, y_axis='linear', x_axis='time',sr=sr3, ax=axs[0,0])
D2 = librosa.amplitude_to_db(np.abs(librosa.stft(y4)), ref=np.max)
img2 = librosa.display.specshow(D2, y_axis='linear', x_axis='time',sr=sr4, ax=axs[0,1])

axs[0,0].set_title(dir_list[8512], fontsize=20)
axs[0,1].set_title(dir_list[7481],fontsize=20)
hop_length = 1024

D3 = librosa.amplitude_to_db(np.abs(librosa.stft(y3, hop_length=hop_length)), ref=np.max)
librosa.display.specshow(D3, y_axis='log', sr=sr3, hop_length=hop_length,x_axis='time', ax=axs[1,0])
D4 = librosa.amplitude_to_db(np.abs(librosa.stft(y4, hop_length=hop_length)), ref=np.max)
librosa.display.specshow(D4, y_axis='log', sr=sr4, hop_length=hop_length,x_axis='time', ax=axs[1,1])
axs[1,0].set(title='Log-frequency power spectrogram')
axs[1,1].set(title='Log-frequency power spectrogram')

fig.colorbar(img, ax=axs, format="%+2.f dB", anchor =(2,1))
plt.setp(axs,xlim=(0,2));
plt.savefig('mallet_keyboard_spec.png')
In [24]:
# plots scatterplot of 2 audio features (spectral centriod and zero cross rate) with the hue representing pitch

plt.figure(figsize=(16,16), dpi=200)
sns.scatterplot(data=samples,x='spectral_centroids', y='zero_crossing_rate', hue = 'pitch',palette ='viridis')
plt.title('Zero Crossing Rate vs Spetral Centroids',fontdict={'fontsize': 24})
plt.xlabel('Spectral Centroids')
plt.ylabel('Zero Crossing Rate')
plt.savefig('cent_zero_cross_scatter.png')
In [25]:
# creates barplot of MFCC 3 by instrument

plt.figure(figsize=(10,6), dpi=200)
sns.barplot(data=samples, x='instrument_family_str', y= 'mfcc3', palette = 'mako')
plt.xlabel('Instrument')
plt.ylabel('MFCC 3');
plt.title('MFCC 3 by Instrument')
plt.savefig('mfcc3_inst.png')
In [26]:
# creates barplot of MFCC 8 by instrument

plt.figure(figsize=(10,6), dpi=200)
sns.barplot(data=samples, x='instrument_family_str', y= 'mfcc8', palette = 'mako')
plt.xlabel('Instrument')
plt.title('MFCC 8 by Instrument')
plt.ylabel('MFCC 8');
plt.savefig('mfcc8_inst.png')
In [27]:
# creates barplot of spectral bandwidth by instrument

plt.figure(figsize=(10,6), dpi=200)
sns.barplot(data=samples, x='instrument_family_str', y= 'spectral_bandwidth_2', palette = 'mako')
plt.xlabel('Instrument')
plt.ylabel('spectral_bandwidth')
plt.title('Spectral Bandwidth by Instrument');
plt.savefig('spec_band.png')
In [28]:
#samples.to_csv('samples.csv')
In [29]:
# creates plot showing relationship between various audio features with the hue representing pitch

g = sns.pairplot(data=samples, vars = ['mfcc2','mfcc3','mfcc4','mfcc5','mfcc6','mfcc7','spectral_bandwidth_3'],hue='pitch', palette='viridis')
g.fig.suptitle("Pair Plot of Various Audio Features by Pitch", fontsize = 24, weight = 'bold', y = 1.02)
Out[29]:
Text(0.5, 1.02, 'Pair Plot of Various Audio Features by Pitch')
In [30]:
samples.head()
Out[30]:
Unnamed: 0 note_str sample_rate qualities_str instrument_source instrument_family_str instrument_family note instrument_source_str qualities ... mfcc_accelerate_16 mfcc17 mfcc_delta_17 mfcc_accelerate_17 mfcc18 mfcc_delta_18 mfcc_accelerate_18 mfcc19 mfcc_delta_19 mfcc_accelerate_19
0 bass_electronic_018-022-050 bass_electronic_018-022-050 16000 ['percussive'] 1 bass 0 277009 electronic [0, 0, 0, 0, 0, 0, 0, 1, 0, 0] ... 0.046337 -0.739872 -0.046959 0.051725 -0.290447 -0.082190 0.018868 2.694299 -0.056343 -0.000569
1 bass_electronic_018-022-127 bass_electronic_018-022-127 16000 ['fast_decay', 'percussive'] 1 bass 0 223304 electronic [0, 0, 0, 1, 0, 0, 0, 1, 0, 0] ... -0.275117 -0.969569 0.028864 -0.215973 -2.457191 -0.140676 0.338458 -0.499463 -0.884398 0.581262
2 bass_electronic_018-023-050 bass_electronic_018-023-050 16000 [] 1 bass 0 222626 electronic [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ... 0.013104 1.079908 -0.011627 0.000507 0.035780 0.013179 0.005179 -0.784336 0.031024 -0.011306
3 bass_electronic_018-023-100 bass_electronic_018-023-100 16000 [] 1 bass 0 230338 electronic [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ... 0.006131 1.711984 0.009815 0.006658 0.741615 -0.012899 0.003196 -0.122626 0.010039 -0.027255
4 bass_electronic_018-024-050 bass_electronic_018-024-050 16000 [] 1 bass 0 284868 electronic [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ... 0.010970 1.546056 0.016031 -0.002961 0.683596 -0.000470 0.006038 -0.126844 -0.017591 -0.018448

5 rows × 102 columns

In [31]:
# drops unnecessary columns from samples dataframe
samples = samples.drop(['note_str', 'instrument_source', 'instrument_family','sample_rate', 'qualities_str', 'file_name','instrument_source_str', 'qualities', 'Unnamed: 0','instrument_str', 'instrument','velocity','file_name','note'], axis = 1)
In [32]:
# removes notes where pitch is outside range of a standard keyboard

note=samples[samples['pitch'] > 21]
note=note[note['pitch']<108]
In [33]:
inst = note.drop('pitch', axis=1) # removes pitch for dataframe where instrument is predicting class
In [34]:
note=note.drop('instrument_family_str',axis=1) # removes instrument for dataframe where pitch is predicting class
In [35]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

Instrument Prediction¶

In [36]:
# sets X,y values for instrument prediction

X=inst.drop('instrument_family_str', axis=1)
y=inst['instrument_family_str']
In [37]:
scaler = StandardScaler() # creates instance of standard scaler
In [38]:
# splits dataframe for training / testing
X_train_inst, X_test_inst, y_train_inst, y_test_inst = train_test_split(X, y, test_size=0.15, random_state=101)
In [39]:
scaled_X_train_inst = scaler.fit_transform(X_train_inst) # fits scaler to training data, scales training data
scaled_X_test_inst = scaler.transform(X_test_inst) # scales test data

Logrithmic Regression

In [40]:
log_model=LogisticRegression(solver='saga', multi_class='ovr', max_iter=10000) # defines model to be used for training
In [41]:
# sets up parameters for grid search
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)
grid_model = GridSearchCV(log_model,param_grid={'C':C,'penalty':penalty})
In [42]:
grid_model.fit(scaled_X_train_inst,y_train_inst) # finds best parameters for model based on training data
Out[42]:
GridSearchCV(estimator=LogisticRegression(max_iter=10000, multi_class='ovr',
                                          solver='saga'),
             param_grid={'C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04]),
                         'penalty': ['l1', 'l2']})
In [43]:
grid_model.best_params_ # displays best parameters from grid search
Out[43]:
{'C': 10000.0, 'penalty': 'l2'}
In [44]:
y_preds = grid_model.predict(scaled_X_test_inst) # predicts instrument name
In [45]:
from sklearn.metrics import classification_report, plot_confusion_matrix
In [46]:
print(classification_report(y_preds,y_test_inst)) # displays classification report
              precision    recall  f1-score   support

        bass       0.85      0.79      0.82       407
       brass       0.91      0.81      0.86       143
       flute       0.71      0.79      0.75        78
      guitar       0.75      0.67      0.71       316
    keyboard       0.72      0.78      0.75       335
      mallet       0.65      0.86      0.74        76
       organ       0.90      0.91      0.91       228
        reed       0.93      0.96      0.94        95
      string       0.81      0.86      0.83       128
       vocal       1.00      1.00      1.00        50

    accuracy                           0.81      1856
   macro avg       0.82      0.84      0.83      1856
weighted avg       0.81      0.81      0.81      1856

In [47]:
# plots confusion matrix for instrument name

plt.figure(figsize=(14,14))
plot_confusion_matrix(grid_model,scaled_X_test_inst,y_test_inst)
plt.xticks(rotation=90);
<Figure size 1008x1008 with 0 Axes>

K Nearest Neighbors

In [48]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
In [49]:
# determines error rate for KNN model with various K values to determine optimal K for prediction

test_error_rates = []

for k in range(1,100):
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(scaled_X_train_inst,y_train_inst) 
   
    y_pred_test = knn_model.predict(scaled_X_test_inst)
    
    test_error = 1 - accuracy_score(y_test_inst,y_pred_test)
    test_error_rates.append(test_error)
In [50]:
# plots K value vs error rate

plt.figure(figsize=(10,6),dpi=200)
plt.plot(range(1,100),test_error_rates,label='Test Error')
plt.legend()
plt.ylabel('Error Rate')
plt.xlabel("K Value")
plt.title('K Value vs Error Rate');
plt.savefig('k_val_inst.jpg')
In [51]:
KNN_model = KNeighborsClassifier(n_neighbors=1) # initiates model with K = 1
In [52]:
KNN_model.fit(scaled_X_train_inst,y_train_inst)  # fits model to training data
   
y_pred_test = KNN_model.predict(scaled_X_test_inst) # predicts instrument name
In [53]:
print(classification_report(y_pred_test,y_test_inst)) # displays classification report for KNN
              precision    recall  f1-score   support

        bass       0.99      0.95      0.97       397
       brass       0.97      0.98      0.98       126
       flute       1.00      0.98      0.99        89
      guitar       0.95      0.98      0.96       276
    keyboard       0.97      0.99      0.98       356
      mallet       0.97      0.99      0.98        98
       organ       0.97      1.00      0.99       225
        reed       0.99      0.99      0.99        98
      string       1.00      0.96      0.98       141
       vocal       1.00      1.00      1.00        50

    accuracy                           0.98      1856
   macro avg       0.98      0.98      0.98      1856
weighted avg       0.98      0.98      0.98      1856

Random Forest

In [54]:
from sklearn.ensemble import RandomForestClassifier
In [55]:
rand_model = RandomForestClassifier() # initiates random forest model
In [56]:
rand_model.fit(scaled_X_train_inst,y_train_inst) # fits model to training data
Out[56]:
RandomForestClassifier()
In [57]:
preds_inst = rand_model.predict(scaled_X_test_inst) # predicts instrument name
In [58]:
print(classification_report(preds_inst,y_test_inst)) # displays classification report for random forest
              precision    recall  f1-score   support

        bass       1.00      0.99      1.00       383
       brass       1.00      1.00      1.00       128
       flute       1.00      1.00      1.00        87
      guitar       0.99      1.00      0.99       281
    keyboard       1.00      1.00      1.00       362
      mallet       1.00      1.00      1.00       100
       organ       1.00      1.00      1.00       231
        reed       1.00      1.00      1.00        98
      string       1.00      1.00      1.00       136
       vocal       1.00      1.00      1.00        50

    accuracy                           1.00      1856
   macro avg       1.00      1.00      1.00      1856
weighted avg       1.00      1.00      1.00      1856

In [59]:
# displays confusion matrix for random forest
plot_confusion_matrix(rand_model,scaled_X_test_inst,y_test_inst)
plt.xticks(rotation = 90);

Pitch Prediction¶

In [60]:
note.head()
Out[60]:
pitch zero_crossing_rate zero_crossings spectrogram mel_spectrogram harmonics perceptual_shock_wave spectral_centroids spectral_centroids_delta spectral_centroids_accelerate ... mfcc_accelerate_16 mfcc17 mfcc_delta_17 mfcc_accelerate_17 mfcc18 mfcc_delta_18 mfcc_accelerate_18 mfcc19 mfcc_delta_19 mfcc_accelerate_19
0 22 0.210125 15742 -66.413757 -77.642357 0.000006 -0.000673 2671.302747 22.499443 1.726520 ... 0.046337 -0.739872 -0.046959 0.051725 -0.290447 -0.082190 0.018868 2.694299 -0.056343 -0.000569
1 22 0.062500 459 -35.409962 -52.084045 0.001413 -0.001678 667.165083 82.330465 83.957916 ... -0.275117 -0.969569 0.028864 -0.215973 -2.457191 -0.140676 0.338458 -0.499463 -0.884398 0.581262
2 23 0.008180 589 -25.864546 -30.828018 0.000110 0.000114 129.129320 7.923706 3.023670 ... 0.013104 1.079908 -0.011627 0.000507 0.035780 0.013179 0.005179 -0.784336 0.031024 -0.011306
3 23 0.007953 562 -27.519205 -33.968998 0.000027 0.000153 120.454945 5.007528 4.103833 ... 0.006131 1.711984 0.009815 0.006658 0.741615 -0.012899 0.003196 -0.122626 0.010039 -0.027255
4 24 0.009085 646 -26.927103 -31.652479 0.000025 0.000088 127.136842 6.714991 3.042447 ... 0.010970 1.546056 0.016031 -0.002961 0.683596 -0.000470 0.006038 -0.126844 -0.017591 -0.018448

5 rows × 88 columns

In [61]:
# sets X and y values for note preidiction

X = note.drop('pitch', axis=1)
y=note['pitch']
In [62]:
# splits data in train, test sets
X_train_note, X_test_note, y_train_note, y_test_note = train_test_split(X, y, test_size=0.15, random_state=101)
In [63]:
rand_model2 = RandomForestClassifier() # initiates random forest model
In [64]:
rand_model2.fit(X_train_note,y_train_note) # fits model to training data
Out[64]:
RandomForestClassifier()
In [65]:
preds = rand_model2.predict(X_test_note) # predicts pitch
In [66]:
print(classification_report(preds,y_test_note)) # displays classification report random forest
              precision    recall  f1-score   support

          22       0.67      0.67      0.67        12
          23       0.57      0.80      0.67        15
          24       0.65      0.79      0.71        19
          25       0.84      0.80      0.82        20
          26       0.92      0.73      0.81        15
          27       0.92      1.00      0.96        24
          28       0.73      0.79      0.76        14
          29       0.90      0.82      0.86        33
          30       1.00      0.65      0.79        20
          31       0.90      0.90      0.90        20
          32       0.90      0.88      0.89        32
          33       0.85      0.81      0.83        27
          34       0.96      0.96      0.96        26
          35       0.71      0.86      0.77        14
          36       0.80      0.73      0.76        22
          37       0.93      0.90      0.91        29
          38       1.00      0.94      0.97        34
          39       0.83      0.96      0.89        25
          40       0.92      0.86      0.89        28
          41       0.87      0.87      0.87        30
          42       0.79      0.88      0.84        26
          43       0.88      0.88      0.88        25
          44       0.87      0.91      0.89        22
          45       0.80      0.91      0.85        22
          46       0.93      0.89      0.91        28
          47       1.00      0.74      0.85        23
          48       0.95      0.86      0.90        21
          49       0.82      0.88      0.85        26
          50       0.94      1.00      0.97        31
          51       0.97      0.97      0.97        34
          52       0.94      0.94      0.94        32
          53       0.88      0.96      0.92        23
          54       0.97      0.97      0.97        32
          55       0.97      0.97      0.97        34
          56       0.97      0.89      0.93        38
          57       0.93      1.00      0.96        26
          58       0.80      1.00      0.89        20
          59       1.00      0.86      0.92        28
          60       0.97      0.91      0.94        35
          61       0.96      1.00      0.98        22
          62       1.00      0.96      0.98        23
          63       1.00      1.00      1.00        28
          64       1.00      1.00      1.00        25
          65       1.00      1.00      1.00        19
          66       0.96      1.00      0.98        23
          67       1.00      0.96      0.98        23
          68       0.89      1.00      0.94        25
          69       1.00      1.00      1.00        18
          70       1.00      0.96      0.98        24
          71       1.00      0.83      0.91        18
          72       1.00      1.00      1.00        20
          73       1.00      1.00      1.00        15
          74       0.95      1.00      0.97        18
          75       1.00      0.92      0.96        25
          76       0.96      0.96      0.96        25
          77       0.94      1.00      0.97        16
          78       1.00      1.00      1.00        19
          79       1.00      1.00      1.00        27
          80       1.00      0.94      0.97        18
          81       1.00      0.96      0.98        24
          82       0.94      1.00      0.97        16
          83       0.86      0.95      0.90        20
          84       1.00      1.00      1.00        25
          85       0.88      0.93      0.90        15
          86       1.00      0.94      0.97        18
          87       1.00      0.95      0.98        22
          88       0.92      1.00      0.96        22
          89       0.95      0.95      0.95        22
          90       1.00      0.91      0.95        11
          91       0.83      0.94      0.88        16
          92       0.86      0.92      0.89        13
          93       0.94      0.94      0.94        17
          94       0.86      0.92      0.89        13
          95       0.87      0.93      0.90        14
          96       0.93      0.88      0.90        16
          97       0.87      0.93      0.90        14
          98       0.88      1.00      0.94        15
          99       1.00      0.83      0.91        12
         100       0.77      1.00      0.87        10
         101       1.00      1.00      1.00        14
         102       0.87      1.00      0.93        13
         103       1.00      0.46      0.63        24
         104       0.92      0.92      0.92        12
         105       0.81      0.93      0.87        14
         106       0.82      0.93      0.87        15
         107       0.92      0.92      0.92        13

    accuracy                           0.92      1856
   macro avg       0.91      0.92      0.91      1856
weighted avg       0.92      0.92      0.92      1856

In [67]:
param_grid = {"n_estimators":[100,150,200,250],'max_depth':[6,10,14,20,25]} # defines parameter values for grid search
In [68]:
rand_model3 = RandomForestClassifier() # initiates random forest model
In [69]:
grid = GridSearchCV(rand_model3,param_grid) # sets up grid search
In [70]:
grid.fit(X_train_note,y_train_note) # performs grid search 
Out[70]:
GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [6, 10, 14, 20, 25],
                         'n_estimators': [100, 150, 200, 250]})
In [71]:
grid.best_params_ # displays best parameters based on training data
Out[71]:
{'max_depth': 25, 'n_estimators': 200}
In [72]:
preds = grid.predict(X_test_note) # predicts pitch
In [73]:
print(classification_report(preds,y_test_note)) # displays classification report random forest - grid
              precision    recall  f1-score   support

          22       0.67      0.53      0.59        15
          23       0.57      0.75      0.65        16
          24       0.61      0.82      0.70        17
          25       0.84      0.84      0.84        19
          26       0.92      0.69      0.79        16
          27       0.88      0.96      0.92        24
          28       0.80      0.80      0.80        15
          29       0.83      0.83      0.83        30
          30       1.00      0.81      0.90        16
          31       0.90      1.00      0.95        18
          32       0.90      0.85      0.88        33
          33       0.85      0.81      0.83        27
          34       0.88      1.00      0.94        23
          35       0.65      0.73      0.69        15
          36       0.75      0.65      0.70        23
          37       0.93      0.96      0.95        27
          38       0.97      0.89      0.93        35
          39       0.83      0.92      0.87        26
          40       0.96      0.86      0.91        29
          41       0.87      0.93      0.90        28
          42       0.90      0.90      0.90        29
          43       0.96      0.92      0.94        26
          44       0.87      0.91      0.89        22
          45       0.80      0.91      0.85        22
          46       1.00      0.87      0.93        31
          47       1.00      0.77      0.87        22
          48       0.89      0.85      0.87        20
          49       0.82      0.96      0.88        24
          50       0.94      0.97      0.95        32
          51       1.00      0.97      0.99        35
          52       0.94      0.91      0.92        33
          53       0.84      1.00      0.91        21
          54       0.94      0.97      0.95        31
          55       0.97      0.97      0.97        34
          56       0.97      0.89      0.93        38
          57       0.93      1.00      0.96        26
          58       0.84      1.00      0.91        21
          59       1.00      0.89      0.94        27
          60       0.97      0.89      0.93        36
          61       1.00      0.88      0.94        26
          62       1.00      1.00      1.00        22
          63       1.00      1.00      1.00        28
          64       0.96      1.00      0.98        24
          65       1.00      1.00      1.00        19
          66       0.96      1.00      0.98        23
          67       0.95      1.00      0.98        21
          68       0.89      1.00      0.94        25
          69       1.00      1.00      1.00        18
          70       1.00      0.88      0.94        26
          71       0.93      0.82      0.87        17
          72       0.95      1.00      0.97        19
          73       1.00      1.00      1.00        15
          74       1.00      1.00      1.00        19
          75       1.00      0.96      0.98        24
          76       0.96      1.00      0.98        24
          77       1.00      1.00      1.00        17
          78       1.00      1.00      1.00        19
          79       1.00      0.96      0.98        28
          80       1.00      0.94      0.97        18
          81       1.00      0.96      0.98        24
          82       0.94      1.00      0.97        16
          83       0.86      0.95      0.90        20
          84       1.00      0.93      0.96        27
          85       0.88      0.93      0.90        15
          86       0.94      0.94      0.94        17
          87       1.00      0.95      0.98        22
          88       0.92      1.00      0.96        22
          89       0.95      1.00      0.98        21
          90       1.00      0.83      0.91        12
          91       0.83      0.94      0.88        16
          92       0.86      0.92      0.89        13
          93       0.94      0.94      0.94        17
          94       0.86      1.00      0.92        12
          95       0.87      1.00      0.93        13
          96       1.00      0.83      0.91        18
          97       0.80      0.92      0.86        13
          98       0.88      1.00      0.94        15
          99       1.00      0.91      0.95        11
         100       0.85      1.00      0.92        11
         101       1.00      1.00      1.00        14
         102       0.93      1.00      0.97        14
         103       0.91      0.43      0.59        23
         104       0.92      0.92      0.92        12
         105       0.81      0.93      0.87        14
         106       0.82      0.88      0.85        16
         107       0.92      0.86      0.89        14

    accuracy                           0.92      1856
   macro avg       0.91      0.91      0.91      1856
weighted avg       0.92      0.92      0.91      1856

In [74]:
X_test_note['Note'] = preds # adds note prediction to test set
In [75]:
X_test_note['Instrument'] = preds_inst # adds instrument prediction to test set
In [76]:
X_test_note.head()
Out[76]:
zero_crossing_rate zero_crossings spectrogram mel_spectrogram harmonics perceptual_shock_wave spectral_centroids spectral_centroids_delta spectral_centroids_accelerate chroma1 ... mfcc_delta_17 mfcc_accelerate_17 mfcc18 mfcc_delta_18 mfcc_accelerate_18 mfcc19 mfcc_delta_19 mfcc_accelerate_19 Note Instrument
7829 0.007898 581 -43.719860 -77.248512 -1.015214e-05 0.000017 226.374534 5.361960 2.498784 0.222539 ... 0.050393 -0.021422 -2.508775 0.052334 -0.014904 -2.337551 0.038754 -0.007967 29 keyboard
2043 0.017844 1355 -58.730682 -77.676529 2.947440e-05 -0.000027 202.495059 3.167869 1.487649 0.008804 ... -0.051306 0.028265 -11.960894 -0.041612 0.029439 -12.619340 -0.012467 0.019915 43 bass
8896 0.013378 1185 -58.515682 -79.649185 3.395107e-05 -0.000033 205.533172 6.573461 2.840984 0.501567 ... 0.075102 0.020115 -18.010260 0.071818 0.034119 -16.802776 0.003094 0.043800 46 mallet
1830 0.003512 267 -15.204553 -43.861546 -1.311943e-02 0.000019 79.184042 2.821670 5.462707 0.746378 ... 0.035672 -0.006857 9.487486 0.042451 -0.014337 9.742423 -0.120129 -0.019815 77 bass
12077 0.018680 1650 -36.952751 -68.855492 5.404165e-07 0.000111 444.422995 4.781714 2.502548 0.067997 ... 0.005530 -0.016946 -5.744551 0.008136 -0.012692 -4.648269 0.050792 -0.009209 39 string

5 rows × 89 columns

In [77]:
keyboard = X_test_note[X_test_note['Instrument'] == 'keyboard'] # filters test set to only include keyboard instrument
In [78]:
staff = keyboard[(keyboard['Note'] < 80) & (keyboard['Note'] > 40)] # filters pitch range to 40-80
In [79]:
from music21 import *
In [80]:
staff.head()
Out[80]:
zero_crossing_rate zero_crossings spectrogram mel_spectrogram harmonics perceptual_shock_wave spectral_centroids spectral_centroids_delta spectral_centroids_accelerate chroma1 ... mfcc_delta_17 mfcc_accelerate_17 mfcc18 mfcc_delta_18 mfcc_accelerate_18 mfcc19 mfcc_delta_19 mfcc_accelerate_19 Note Instrument
7993 0.049649 3398 -77.101357 -79.964249 -0.000012 0.000011 672.998898 3.216142 2.635000 0.005937 ... 0.116904 -0.019846 -14.228360 0.085294 0.095099 -13.402055 0.019591 0.112221 70 keyboard
7721 0.075480 5210 -68.871651 -79.879707 -0.000010 -0.000015 1049.140166 -7.853725 -1.897551 0.086783 ... -0.096741 -0.113112 30.227087 -0.146796 -0.024512 -12.848338 0.271040 -0.012711 68 keyboard
7744 0.159245 10981 -69.856911 -79.926369 -0.000006 -0.000023 1805.684493 -9.533681 -3.812219 0.016535 ... -0.039040 -0.090040 -12.432023 0.062077 0.020225 -31.255440 0.103454 0.199113 75 keyboard
8263 0.024276 602 -62.082813 -80.000000 -0.000433 -0.000147 371.681972 -2.283279 -1.647439 0.051801 ... -0.004512 0.109875 -3.915097 0.001845 0.050360 -6.831211 0.063688 0.099692 44 keyboard
8301 0.035381 932 -68.732872 -79.963409 0.000023 -0.000119 537.949450 4.287896 4.590449 0.013967 ... -0.055562 -0.217188 0.244699 -0.080286 -0.031404 -3.288170 -0.121592 0.036604 54 keyboard

5 rows × 89 columns

In [81]:
# deines function to convert midi number to note name

NOTES = ['c', 'c#', 'd', 'd#', 'e', 'f', 'f#', 'g', 'g#', 'a', 'a#', 'b']
OCTAVES = list(range(11))
NOTES_IN_OCTAVE = len(NOTES)

def number_to_note(number: int) -> tuple:
    octave = number // NOTES_IN_OCTAVE
    assert octave in OCTAVES
    assert 0 <= number <= 127
    note = NOTES[number % NOTES_IN_OCTAVE]

    return note+str(octave)
In [82]:
# converts midi number predictions to named notes

notes = '1/4 '
for i in range(len(staff)):
    note = number_to_note(staff['Note'].values[i])
    notes = notes+' '+note+' '
In [83]:
test_staff = converter.parse("tinyNotation:"+notes) # plots notes on staff
In [84]:
test_staff.show() # displays staff